#1. Load Required Packages
# Packages are handled one at a time, in the listed order: a package whose
# namespace cannot be loaded is installed first, then it is attached.
required_packages <- c("readxl", "dplyr", "nortest")
for (package in required_packages) {
  is_available <- requireNamespace(package, quietly = TRUE)
  if (!is_available) {
    install.packages(package)
  }
  # character.only = TRUE: `package` holds the name as a string.
  library(package, character.only = TRUE)
}
#Description: This section installs any of the required packages (readxl, dplyr, nortest) that is missing and attaches all of them.


#2. Import Data Frame
# A plain `exists("df")` is not a usable guard here: it also finds the base
# function stats::df (density of the F distribution), so it is TRUE in every
# session and the Excel file would never be read. Look for an object of mode
# "list" (which skips functions) and require it to be a data frame.
if (!is.data.frame(get0("df", mode = "list"))) {
  file_path <- "~/COHORT_Prediction_of_CIP.xlsx"
  # Cells containing "#N/A" in the sheet are read as missing values.
  df <- read_excel(file_path, sheet = "cohort", na = "#N/A")
}
#Description: This section imports the "cohort" sheet of the Excel file unless a data frame named df is already available in the session.


#3. Filter Data
# Step 1: keep rows whose pet_distance_ici lies in [-1, 365]. Rows with a
# missing pet_distance_ici are dropped as well (subset() treats an NA condition
# as FALSE).
df_filtered_distance <- subset(
  df,
  !is.na(pet_distance_ici) & pet_distance_ici >= -1 & pet_distance_ici <= 365
)
# Step 2: of those, keep rows where pet_impossible_spheres_placement is 0.
df_filtered <- subset(
  df_filtered_distance,
  pet_impossible_spheres_placement == 0
)
#Description: This section keeps only records where the distance between PET/CT and immunotherapy is between -1 and 365 days (df_filtered_distance) and, of those, only records with pet_impossible_spheres_placement equal to 0 (df_filtered).


#4. Output Text Information
# Row counts of the three cohort stages. nrow() gives a plain integer;
# dplyr::count() would return a one-cell tibble that only prints correctly
# because paste() happens to coerce it.
n_cohort <- nrow(df)
n_within_interval <- nrow(df_filtered_distance)
n_analysed <- nrow(df_filtered)

# paste() already separates its arguments with one space, so the literals carry
# no trailing blank (the former "A total of " printed a double space).
print(paste("A total of", n_cohort, "patients with lung cancer underwent ICI therapy during the observation period."))
print(paste(n_cohort - n_within_interval, "patients were excluded because the staging with FDG-PET/CT was acquired after therapy with immune checkpoint inhibitors was started"))
print(paste("In", n_within_interval - n_analysed, "patient, the placement of spheres in non-tumorous lung tissue was not feasible due to a prior lung operation and extensive tumor infiltration"))
print(paste("The baseline characteristics of the remaining", n_analysed, "patients are presented in Table 1."))
#Description: This section prints the size of the full cohort, the number of rows removed by each of the two filters, and the size of the analysed cohort.


#5. Define Function to Run Numerical Tests
# Compares one numeric column of the global `df_filtered` between the groups of
# `df_filtered$pneumonitis`.
#   var: column name (character scalar) of the variable to test.
# Returns a one-row data frame with the columns variable, all, noPneumonitis,
# Pneumonitis (each formatted as "mean (±sd)", rounded to one decimal), p_value
# (rounded to three decimals) and significance ("***", "**", "*", "." or "").
# The function reads `df_filtered` only; it does not touch the global `results`.
run_num_tests <- function(var) {
  # Lilliefors test on the pooled values selects the group comparison:
  # wilcox.test() if normality is rejected at p < 0.01, otherwise t.test().
  lillie_test <- lillie.test(df_filtered[[var]])
  if (lillie_test$p.value < 0.01) {
    test <- wilcox.test(df_filtered[[var]] ~ pneumonitis, data = df_filtered)
  } else {
    test <- t.test(df_filtered[[var]] ~ pneumonitis, data = df_filtered)
  }

  # Per-group mean and sd. Row 1 is reported as "noPneumonitis" and row 2 as
  # "Pneumonitis" -- presumably pneumonitis is coded 0/1; verify against data.
  summary_df_filtered <- df_filtered %>%
    group_by(pneumonitis) %>%
    summarise(
      mean = mean(.data[[var]], na.rm = TRUE),
      sd = sd(.data[[var]], na.rm = TRUE),
      .groups = "keep"
    )

  # An undefined p-value gets no significance mark instead of stopping the
  # script with "missing value where TRUE/FALSE needed".
  p <- test$p.value
  if (is.na(p)) {
    significance <- ""
  } else if (p < 0.001) {
    significance <- "***"
  } else if (p < 0.01) {
    significance <- "**"
  } else if (p < 0.05) {
    significance <- "*"
  } else if (p < 0.1) {
    significance <- "."
  } else {
    significance <- ""
  }

  var_mean <- round(mean(df_filtered[[var]], na.rm = TRUE), 1)
  var_sd <- round(sd(df_filtered[[var]], na.rm = TRUE), 1)

  results <- data.frame(
    variable = var,
    all = paste0(var_mean, " (±", var_sd, ")"),
    noPneumonitis = paste0(round(summary_df_filtered$mean[1], 1), " (±", round(summary_df_filtered$sd[1], 1), ")"),
    Pneumonitis = paste0(round(summary_df_filtered$mean[2], 1), " (±", round(summary_df_filtered$sd[2], 1), ")"),
    p_value = round(p, 3),
    significance = significance
  )
  return(results)
}
#Description: This function runs a t-test or Wilcoxon test, depending on the Lilliefors normality test, then calculates overall and per-group mean and standard deviation and returns them as one table row.


#6. Define Function to Run Binary Tests
# Tests the association between one 0/1 column of the global `df_filtered` and
# `df_filtered$pneumonitis`.
#   var: column name (character scalar) of the variable to test.
# Returns NULL when the column does not contain both a 0 and a 1; otherwise a
# one-row data frame with the columns variable, all, noPneumonitis, Pneumonitis
# (each formatted as "count/total (percent%)"), p_value (rounded to three
# decimals) and significance ("***", "**", "*", "." or "").
# The function depends on `df_filtered` and its `var` argument only, so it can
# be called outside the main loop.
run_bi_tests <- function(var) {
  values <- df_filtered[[var]]
  if (!all(c(0, 1) %in% values)) {
    return(NULL)
  }

  # Rows: levels of the variable; columns: levels of pneumonitis.
  tbl <- table(df_filtered[[var]], df_filtered$pneumonitis)
  # Fisher's exact test as soon as one cell holds fewer than 11 observations,
  # chi-squared test otherwise.
  if (min(tbl) < 11) {
    test <- fisher.test(tbl)
  } else {
    test <- chisq.test(tbl)
  }

  p <- test$p.value
  if (is.na(p)) {
    significance <- ""
  } else if (p < 0.001) {
    significance <- "***"
  } else if (p < 0.01) {
    significance <- "**"
  } else if (p < 0.05) {
    significance <- "*"
  } else if (p < 0.1) {
    significance <- "."
  } else {
    significance <- ""
  }

  var_count <- sum(!is.na(values))
  var_count_ones <- sum(values == 1, na.rm = TRUE)
  # Sex-specific smoking variables are expressed relative to the number of
  # women / men; every other variable relative to its non-missing values.
  denominator <- switch(
    var,
    nicotine_female = sum(df_filtered$female == 1, na.rm = TRUE),
    nicotine_male = sum(df_filtered$male == 1, na.rm = TRUE),
    var_count
  )
  var_percent <- round(var_count_ones / denominator * 100, 1)

  # tbl[2, ] is the row of the value 1 as long as the column holds only 0/1
  # (the caller checks this before dispatching here).
  results <- data.frame(
    variable = var,
    all = paste0(var_count_ones, "/", sum(tbl), " (", var_percent, "%)"),
    noPneumonitis = paste0(round(tbl[2, 1], 0), "/", sum(tbl[, 1]), " (", round(tbl[2, 1] / sum(tbl[, 1]) * 100, 1), "%)"),
    Pneumonitis = paste0(round(tbl[2, 2], 0), "/", sum(tbl[, 2]), " (", round(tbl[2, 2] / sum(tbl[, 2]) * 100, 1), "%)"),
    p_value = round(p, 3),
    significance = significance
  )
  return(results)
}
#Description: This function runs Fisher's exact test or the Chi-square test on a binary variable and returns counts, percentages, the p-value and the significance level as one table row (NULL if the variable does not take both values 0 and 1).


#7. Define Function to Insert Row in Results
# Inserts one extra row into the global `results` data frame, directly after
# its first `row` rows, and writes the enlarged table back to `results`.
#   var: text for the Group column. The special value "count" produces a row
#        with an empty Group and the group sizes taken from
#        table(df_filtered$pneumonitis) as "n = <count> (<percent>%)".
#   row: number of existing rows that stay above the new row; 0 inserts at the
#        top, nrow(results) appends at the bottom.
# Stops with an error if `row` lies outside 0..nrow(results).
insert_row <- function(var, row) {
  all <- ""
  noPneumonitis <- ""
  Pneumonitis <- ""
  p_value <- ""
  significance <- ""
  if (var == "count") {
    tbl <- table(df_filtered$pneumonitis)
    var <- ""
    all <- paste0("n = ", sum(tbl), " (", paste0(round(sum(tbl) / sum(tbl) * 100, 2), "%"), ")")
    noPneumonitis <- paste0("n = ", tbl[1], " (", paste0(round(tbl[1] / sum(tbl) * 100, 2), "%"), ")")
    Pneumonitis <- paste0("n = ", tbl[2], " (", paste0(round(tbl[2] / sum(tbl) * 100, 2), "%"), ")")
  }
  new_row <- data.frame(Group = var, all, noPneumonitis, Pneumonitis, p_value, significance)

  n_rows <- nrow(results)
  if (row < 0 || row > n_rows) {
    stop("insert_row(): 'row' must be between 0 and nrow(results)", call. = FALSE)
  }
  # (row + 1):n_rows would count downwards when row == n_rows and pull in an NA
  # row plus a duplicate, so the tail index is built explicitly.
  head_idx <- seq_len(row)
  tail_idx <- if (row < n_rows) (row + 1):n_rows else integer(0)
  results <<- rbind(results[head_idx, ], new_row, results[tail_idx, ])
}
#Description: This function inserts a new row into the results data frame, based on the given variable and row position.

#8. Data Preparation and Variables Setup
# Codes of the (up to) five most frequent PET/CT devices. Computed in a single
# expression so that no global named `count` is left behind (that name belongs
# to dplyr::count), and with head() so that fewer than five devices do not pad
# the result with NA.
top5 <- head(names(sort(table(df_filtered$pet_device), decreasing = TRUE)), 5)

# 0/1 indicator columns. A missing source value yields NA, except for
# otherDevices, where %in% never returns NA.
# NOTE(review): the five named scanner columns use fixed device codes
# (1, 2, 3, 6, 8) while otherDevices is derived from `top5`; the two only
# partition the data if those codes are the five most frequent -- confirm.
df_filtered <- df_filtered %>%
  mutate(
    staging_I = if_else(staging == 1, 1, 0),
    staging_II = if_else(staging == 2, 1, 0),
    staging_III = if_else(staging == 3, 1, 0),
    staging_IV = if_else(staging == 4, 1, 0),
    male = if_else(sex_male == 1, 1, 0),
    female = if_else(sex_male == 0, 1, 0),
    SiemensBiopraph20 = if_else(pet_device == 1, 1, 0),
    SiemensBiopraph40 = if_else(pet_device == 2, 1, 0),
    SiemensBiopraph64 = if_else(pet_device == 3, 1, 0),
    GEDiscovery690 = if_else(pet_device == 6, 1, 0),
    PhilipsGuardianBody = if_else(pet_device == 8, 1, 0),
    otherDevices = if_else(!pet_device %in% top5, 1, 0)
  )
#Description: Prepares the data for analysis by creating new binary columns for staging, sex, and PET/CT devices.


#9. Loop Through Variables and Run Tests
# Columns of df_filtered reported in the table, in output order.
variables <- c(
  "female", "male", "age", "bmi", "nicotine", "packyears", "blood_hb",
  "staging_I", "staging_II", "staging_III", "staging_IV",
  "histo_nsclc_adeno", "histo_nsclc_squamous", "histo_sclc_neuroendocrine", "histo_others",
  "op_before_pet", "radio_thorax", "radio_before_pet", "radio_after_pet", "chemo",
  "pneu_before_pembro", "pneu_before_nivo", "pneu_before_atezo",
  "pneu_before_other", "pneu_before_combi_ipi_nivo", "pneu_before_multiple",
  "copd", "asthma", "pericardial_effusion", "pleural_effusion", "diabetes", "chd",
  "SiemensBiopraph20", "SiemensBiopraph40", "SiemensBiopraph64", "GEDiscovery690",
  "PhilipsGuardianBody", "otherDevices"
)
# Rows flagged as ipilimumab/nivolumab combination are not counted under
# nivolumab as well.
df_filtered$pneu_before_nivo[df_filtered$pneu_before_combi_ipi_nivo == 1] <- 0

# Initialize Results Data Frame
results <- data.frame(Group = character(), all  = integer(),
                      noPneumonitis = integer(), Pneumonitis = integer(),
                      p_value = numeric(), significance = character())

# `i` and `var_count` are deliberately kept as top-level names with these exact
# meanings (loop position and number of non-missing values of the current
# variable); helper functions of this script may read them.
for (i in seq_along(variables)) {
  values <- df_filtered[[variables[i]]]
  var_count <- sum(!is.na(values))

  row_i <- NULL
  # A column without any value is never tested. This has to be checked first:
  # with var_count == 0 the binary check below is trivially TRUE.
  if (var_count > 0) {
    if (sum(values %in% c(0, 1), na.rm = TRUE) == var_count) {
      # Only 0/1 values: binary test (NULL if just one of the two occurs).
      row_i <- run_bi_tests(variables[i])
    } else {
      row_i <- run_num_tests(variables[i])
    }
  }

  # No test result: write a complete six-column placeholder. A NULL cannot be
  # assigned to a data frame row, and a shorter vector would be recycled across
  # the columns.
  if (is.null(row_i)) {
    row_i <- data.frame(
      Group = variables[i],
      all = "NA",
      noPneumonitis = "NA",
      Pneumonitis = "NA",
      p_value = NA_real_,
      significance = ""
    )
  }
  results[i, ] <- row_i
}
#Runs statistical tests on specified variables. Depending on whether the variable is binary or numerical, it calls the appropriate function (run_bi_tests or run_num_tests). The results are then stored in the results data frame; variables that cannot be tested get a placeholder row.


#10. Format Results Data Frame
# Lookup table: variable name as stored in results$Group -> label shown in the
# table. Names that do not occur in results$Group are simply unused.
group_labels <- c(
  female = "Female",
  male = "Male",
  age = "Age [y]",
  bmi = "BMI [kg/m²]",
  nicotine = "Nicotine consumption",
  packyears = "Packyears [y]",
  blood_hb = "Hb [g/dL]",
  staging_I = "I",
  staging_II = "II",
  staging_III = "III",
  staging_IV = "IV",
  histo_nsclc_adeno = "NSCLC adenocarcinoma",
  histo_nsclc_squamous = "NSCLC squamous cell carcinoma",
  histo_sclc_neuroendocrine = "SCLC neuroendocrine carcinoma",
  histo_others = "Other lung cancer histology1",
  op_before_pet = "Lung operation",
  radio_thorax = "Thorax radiation",
  radio_after_pet = "- Thorax radiation after PET/CT",
  radio_before_pet = "- Thorax radiation before PET/CT",
  chemo = "Combination with Chemotherapy",
  pneu_before_nivo = "Nivolumab",
  pneu_before_pembro = "Pembrolizumab",
  pneu_before_atezo = "Atezolizumab",
  pneu_before_other = "Others2",
  pneu_before_combi_ipi_nivo = "Nivolumab and Ipilimumab",
  pneu_before_multiple = "Subsequent therapy with ≥2 ICIs3",
  copd = "COPD",
  asthma = "Asthma bronchial",
  emphysema = "Lung emphysema",
  pericardial_effusion = "Pericardial effusion",
  pleural_effusion = "Pleural effusion",
  diabetes = "Diabetes mellitus type II",
  chd = "Coronary heart disease",
  pet_distance_ici = "Interval between PET/CT and ICI therapy [d]",
  SiemensBiopraph20 = "Siemens Biograph 20",
  SiemensBiopraph40 = "Siemens Biograph 40",
  SiemensBiopraph64 = "Siemens Biograph 64",
  GEDiscovery690 = "GE Discovery 690",
  PhilipsGuardianBody = "Philips Guardian Body",
  otherDevices = "Other scanners4"
)
# Replace every Group entry that has a label; all other entries stay as is.
has_label <- results$Group %in% names(group_labels)
results$Group[has_label] <- unname(group_labels[results$Group[has_label]])
#Description: Replaces the variable names in the Group column of the results data frame with more descriptive row labels.


#11. Insert Rows
# Section headers and the number of rows that precede each of them at the
# moment of insertion. The positions already account for the headers inserted
# before, so the order of the two vectors must not be changed.
section_labels <- c(
  "count", "Physical characteristics", "Sex", "Smoking history",
  "Blood count", "Tumor", "Staging", "Histology", "Clinical history",
  "Immune checkpoint inhibitors", "Comorbidities", "PET/CT Scanner"
)
section_positions <- c(0, 1, 2, 7, 10, 12, 13, 18, 23, 29, 36, 43)
for (k in seq_along(section_labels)) {
  insert_row(section_labels[k], section_positions[k])
}
#Description: Adds rows to the `results` data frame to structure the output into meaningful sections. 


#12. Save Results
# The table is shown on the console and written, without row names, to the
# current working directory.
output_file <- "Table 1.csv"
print(results)
write.csv(results, file = output_file, row.names = FALSE)
print(paste0("saved: ", output_file))
#Prints the final table and saves it to a CSV file.


#13. Print Additional Notes
# Number of entries of `x` equal to `value`. Missing entries are ignored, as in
# the other counts of this script; without na.rm = TRUE a single NA would turn
# the printed count into NA.
count_equal <- function(x, value) {
  sum(x == value, na.rm = TRUE)
}

print (paste("Other immune checkpoint inhibitors: Durvalumab (n =", count_equal(df_filtered$pneu_before_durva, 1), 
             "), Dostarlimab (n =", count_equal(df_filtered$pneu_before_dostar, 1), 
             "), single Ipilimumab (n =", count_equal(df_filtered$pneu_before_ipi, 1),")."))

print (paste("Other lung cancer histology: poorly differentiated lung carcinom (n =", count_equal(df_filtered$histo_poorly_diff, 1), 
             "), adenosquamous lung carcinoma (n =", count_equal(df_filtered$histo_nsclc_adenosquamoes, 1), 
             "), neuroendocrine tumor (n =", count_equal(df_filtered$histo_nsclc_neuroendocrine, 1), 
             "), mucoepidermoid carcinoma (n =", count_equal(df_filtered$histo_nsclc_mucoepidermoid, 1), 
             "), pleomorphic carcinoma (n =", count_equal(df_filtered$histo_nsclc_pleomorph, 1), 
             "), sarcomatoid carcinoma (n =", count_equal(df_filtered$histo_nsclc_sarkomatoid, 1),")."))

print (paste("Other scanners: Siemens Biograph 128 (n =", count_equal(df_filtered$pet_device, 4), 
             "), Siemens 1094 (n =", count_equal(df_filtered$pet_device, 5), 
             "), Philips Medical Systems GEMINI TF Big Bore (n =", count_equal(df_filtered$pet_device, 7), 
             "), Philips Vereos (n =", count_equal(df_filtered$pet_device, 9), 
             "), Philips Medical Systems GEMINI TF TO F16 (n =", count_equal(df_filtered$pet_device, 10), 
             "), GE MEDICAL SYSTEMS Discovery 600 (n =", count_equal(df_filtered$pet_device, 12),
             "), CPS 1024 (n =", count_equal(df_filtered$pet_device, 13),")."))
#Description: Prints additional notes with the patient counts for the less frequent immune checkpoint inhibitors, histologies and PET/CT scanners.

